# Load R packages: tidyverse for data wrangling, reticulate for the R <-> Python bridge
library(tidyverse)
library(reticulate)

# Replace this with your conda environment containing sklearn, pandas, & numpy
use_condaenv("py3.8", required = TRUE)

R

Data Ingest - R

# Read the raw HR dataset (one row per employee; 35 columns)
hr_data_tbl <- read_csv("data/HRDataset_v13.csv")

hr_data_tbl
## # A tibble: 401 x 35
##    Employee_Name  EmpID MarriedID MaritalStatusID GenderID EmpStatusID DeptID
##    <chr>          <dbl>     <dbl>           <dbl>    <dbl>       <dbl>  <dbl>
##  1 Brown, Mia    1.10e9         1               1        0           1      1
##  2 LaRotonda, W… 1.11e9         0               2        1           1      1
##  3 Steans, Tyro… 1.30e9         0               0        1           1      1
##  4 Howard, Este… 1.21e9         1               1        0           1      1
##  5 Singh, Nan    1.31e9         0               0        0           1      1
##  6 Smith, Leigh… 7.11e8         1               1        0           5      1
##  7 Bunbury, Jes… 1.50e9         1               1        0           5      6
##  8 Carter, Mich… 1.40e9         0               0        0           1      6
##  9 Dietrich, Je… 1.41e9         0               0        0           1      6
## 10 Digitale, Al… 1.31e9         1               1        1           1      6
## # … with 391 more rows, and 28 more variables: PerfScoreID <dbl>,
## #   FromDiversityJobFairID <dbl>, PayRate <dbl>, Termd <dbl>, PositionID <dbl>,
## #   Position <chr>, State <chr>, Zip <chr>, DOB <chr>, Sex <chr>,
## #   MaritalDesc <chr>, CitizenDesc <chr>, HispanicLatino <chr>, RaceDesc <chr>,
## #   DateofHire <chr>, DateofTermination <chr>, TermReason <chr>,
## #   EmploymentStatus <chr>, Department <chr>, ManagerName <chr>,
## #   ManagerID <dbl>, RecruitmentSource <chr>, PerformanceScore <chr>,
## #   EngagementSurvey <dbl>, EmpSatisfaction <dbl>, SpecialProjectsCount <dbl>,
## #   LastPerformanceReview_Date <chr>, DaysLateLast30 <dbl>
# Inspect every column's type and first few values
hr_data_tbl %>% glimpse()
## Rows: 401
## Columns: 35
## $ Employee_Name              <chr> "Brown, Mia", "LaRotonda, William", "Stean…
## $ EmpID                      <dbl> 1103024456, 1106026572, 1302053333, 121105…
## $ MarriedID                  <dbl> 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, …
## $ MaritalStatusID            <dbl> 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 4, 1, 0, …
## $ GenderID                   <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, …
## $ EmpStatusID                <dbl> 1, 1, 1, 1, 1, 5, 5, 1, 1, 1, 1, 4, 1, 5, …
## $ DeptID                     <dbl> 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ PerfScoreID                <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, …
## $ FromDiversityJobFairID     <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ PayRate                    <dbl> 28.50, 23.00, 29.00, 21.50, 16.56, 20.50, …
## $ Termd                      <dbl> 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, …
## $ PositionID                 <dbl> 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Position                   <chr> "Accountant I", "Accountant I", "Accountan…
## $ State                      <chr> "MA", "MA", "MA", "MA", "MA", "MA", "VA", …
## $ Zip                        <chr> "01450", "01460", "02703", "02170", "02330…
## $ DOB                        <chr> "11/24/87", "04/26/84", "09/01/86", "09/16…
## $ Sex                        <chr> "F", "M", "M", "F", "F", "F", "F", "F", "F…
## $ MaritalDesc                <chr> "Married", "Divorced", "Single", "Married"…
## $ CitizenDesc                <chr> "US Citizen", "US Citizen", "US Citizen", …
## $ HispanicLatino             <chr> "No", "No", "No", "No", "No", "No", "No", …
## $ RaceDesc                   <chr> "Black or African American", "Black or Afr…
## $ DateofHire                 <chr> "10/27/2008", "1/6/2014", "9/29/2014", "2/…
## $ DateofTermination          <chr> NA, NA, NA, "04/15/15", NA, "09/25/13", "0…
## $ TermReason                 <chr> "N/A - still employed", "N/A - still emplo…
## $ EmploymentStatus           <chr> "Active", "Active", "Active", "Terminated …
## $ Department                 <chr> "Admin Offices", "Admin Offices", "Admin O…
## $ ManagerName                <chr> "Brandon R. LeBlanc", "Brandon R. LeBlanc"…
## $ ManagerID                  <dbl> 1, 1, 1, 1, 1, 1, 17, 17, 17, 17, 17, 17, …
## $ RecruitmentSource          <chr> "Diversity Job Fair", "Website Banner Ads"…
## $ PerformanceScore           <chr> "Fully Meets", "Fully Meets", "Fully Meets…
## $ EngagementSurvey           <dbl> 2.04, 5.00, 3.90, 3.24, 5.00, 3.80, 3.14, …
## $ EmpSatisfaction            <dbl> 2, 4, 5, 3, 3, 4, 5, 5, 1, 5, 5, 4, 3, 5, …
## $ SpecialProjectsCount       <dbl> 6, 4, 5, 4, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ LastPerformanceReview_Date <chr> "1/15/2019", "1/17/2019", "1/18/2019", NA,…
## $ DaysLateLast30             <dbl> 0, 0, 0, NA, 0, NA, NA, 0, 0, 0, 0, NA, 0,…

Subset & Fix Missingness - R

# DataExplorer provides quick missing-data profiling
library(DataExplorer)
# Visualize the share of missing values per column
hr_data_tbl %>% plot_missing()

# Keep a small, complete-case feature subset for clustering.
# Extra feature groups are left commented out for easy experimentation.
hr_subset_tbl <- hr_data_tbl %>%
    select(
        # Employee
        Employee_Name, GenderID, MaritalStatusID, 
        # Pay
        PayRate,
        # Department
         Department # ManagerName,
        # # Performance
        # PerformanceScore, SpecialProjectsCount,
        # # Surveys
        # EngagementSurvey, EmpSatisfaction
    ) %>%
    drop_na()

hr_subset_tbl %>% glimpse()
## Rows: 310
## Columns: 5
## $ Employee_Name   <chr> "Brown, Mia", "LaRotonda, William", "Steans, Tyrone",…
## $ GenderID        <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,…
## $ MaritalStatusID <dbl> 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 4, 1, 0, 3, 0, 0, 1,…
## $ PayRate         <dbl> 28.50, 23.00, 29.00, 21.50, 16.56, 20.50, 55.00, 55.0…
## $ Department      <chr> "Admin Offices", "Admin Offices", "Admin Offices", "A…
# Confirm no missingness remains after drop_na()
hr_subset_tbl %>% plot_missing()

Preprocessing - R

library(recipes)

# Preprocessing pipeline for the clustering features:
#  - drop the identifier column (not a model feature)
#  - treat the integer-coded demographics as categorical
#  - log-transform PayRate to reduce right skew
#  - one-hot encode all nominal variables
# prep() estimates the steps on hr_subset_tbl itself.
rec_obj <- recipe(~ ., hr_subset_tbl) %>%
    step_rm(Employee_Name) %>%
    step_mutate_at(GenderID, MaritalStatusID, fn = as.factor) %>%
    step_log(PayRate) %>%
    step_dummy(all_nominal()) %>%
    prep()
    
rec_obj
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##  predictor          5
## 
## Training data contained 310 data points and no missing data.
## 
## Operations:
## 
## Variables removed Employee_Name [trained]
## Variable mutation for GenderID, MaritalStatusID [trained]
## Log transformation on PayRate [trained]
## Dummy variables from GenderID, MaritalStatusID, Department [trained]
# juice() is superseded in {recipes}; bake(new_data = NULL) is the
# recommended way to extract the preprocessed training data and returns
# the same tibble here.
hr_subset_processed_tbl <- bake(rec_obj, new_data = NULL)

hr_subset_processed_tbl %>% glimpse()
## Rows: 310
## Columns: 11
## $ PayRate                         <dbl> 3.349904, 3.135494, 3.367296, 3.06805…
## $ GenderID_X1                     <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1…
## $ MaritalStatusID_X1              <dbl> 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1…
## $ MaritalStatusID_X2              <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ MaritalStatusID_X3              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ MaritalStatusID_X4              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Department_Executive.Office     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Department_IT.IS                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Department_Production           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Department_Sales                <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1…
## $ Department_Software.Engineering <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Prep for Python
X              <- as.matrix(hr_subset_processed_tbl)
employee_names <- hr_subset_tbl$Employee_Name

Python

Clustering - Python

# Data Manipulation
import pandas as pd
import numpy as np

R to Python

# `r.` exposes objects from the R session to Python via reticulate
r.X
## array([[3.34990409, 0.        , 1.        , ..., 0.        , 0.        ,
##         0.        ],
##        [3.13549422, 1.        , 0.        , ..., 0.        , 0.        ,
##         0.        ],
##        [3.36729583, 1.        , 0.        , ..., 0.        , 0.        ,
##         0.        ],
##        ...,
##        [3.98527347, 1.        , 1.        , ..., 0.        , 0.        ,
##         0.        ],
##        [3.97029191, 1.        , 0.        , ..., 0.        , 0.        ,
##         0.        ],
##        [4.01096295, 0.        , 0.        , ..., 0.        , 0.        ,
##         0.        ]])
# The R character vector arrives as a Python list; wrap it in a pandas Series
pd.Series(r.employee_names)
## 0                Brown, Mia
## 1        LaRotonda, William
## 2            Steans, Tyrone
## 3           Howard, Estelle
## 4                Singh, Nan
##                ...         
## 305            Daniele, Ann
## 306          Lajiri,  Jyoti
## 307    Semizoglou, Jeremiah
## 308              South, Joe
## 309         Warfield, Sarah
## Length: 310, dtype: object

Affinity Propagation

# Machine Learning
from sklearn.cluster import AffinityPropagation
# Fit with default settings; AffinityPropagation chooses the number of
# clusters itself (no n_clusters argument)
af = AffinityPropagation().fit(r.X)
af
## AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
##                     damping=0.5, max_iter=200, preference=None, verbose=False)
# Row indices of the exemplars (the points chosen as cluster centers)
af.cluster_centers_indices_
## array([  7,  10,  12,  18,  35,  40,  45,  48,  78,  83,  94,  97, 114,
##        137, 179, 188, 190, 192, 224, 235, 286, 293, 297, 300])
# One cluster label per employee, in the same order as the rows of X
cluster_assignments_af = af.labels_
cluster_assignments_af
## array([23, 17, 19, 23, 13, 23,  3,  0,  0,  2,  1,  0,  2,  1,  1,  1,  1,
##         3,  3,  1,  2,  0,  1,  0,  2,  1,  1,  2,  0,  0,  3,  1,  0,  7,
##        20,  4,  4,  4,  5,  5,  5,  5,  4,  5,  7,  6,  4,  7,  7,  4,  6,
##         4,  5, 18,  3, 20, 20,  6,  4, 20, 20,  5,  5,  7,  6,  4,  7,  5,
##        20,  7,  7,  4,  4, 23,  8, 19, 19, 18,  8, 11, 11,  9, 19,  9,  9,
##         8,  9,  9, 18, 16, 13, 11, 11, 13, 10, 10, 12, 11, 18, 11, 10, 13,
##        11, 18, 10, 11, 10, 10, 17, 13, 11, 10, 14, 11, 12, 10, 15, 18, 10,
##        10, 16, 13, 16, 11, 13, 11, 10, 13, 16, 14, 15, 18, 12, 10, 18, 13,
##        10, 13, 15, 11, 10, 10, 15, 10, 17, 10, 13, 13, 16, 16, 10, 16, 11,
##        18, 15, 14, 16, 13, 11, 16, 13, 11, 10, 12, 15, 18, 16, 18, 13, 16,
##        16, 13, 11, 10, 10, 18, 15, 13, 11, 14, 16, 10, 16, 10, 15, 13, 10,
##        13, 15, 15, 16, 15, 17, 16, 17, 13, 13, 16, 10, 10, 11, 16, 10, 13,
##        16, 17, 16, 11, 13, 15, 16, 16, 11, 10, 11, 13, 10, 13, 16, 19, 10,
##        13, 10, 19, 18, 19, 15, 10,  9, 18, 10, 11, 18, 19, 10, 19, 11, 10,
##        15, 11, 15, 18, 18, 10, 18, 18, 18, 16,  9, 10, 14, 19, 18, 11, 19,
##        11, 11, 10, 15, 18,  9, 16,  9, 19, 14, 11, 18, 11, 19, 18, 14,  9,
##        18, 19, 18, 16, 18, 14, 10, 10, 12,  0,  0,  1,  5, 20, 20,  2,  2,
##        21, 21, 21, 21, 21, 22, 21, 22, 22, 22, 23, 23,  4,  5, 20,  5,  7,
##         4,  4, 20,  5])

DBSCAN

from sklearn.cluster import DBSCAN
# Density-based clustering (eps defaults to 0.5); points in sparse regions
# are labeled -1 ("noise") instead of being forced into a cluster
db = DBSCAN(min_samples=5).fit(r.X)
db
## DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
##        metric_params=None, min_samples=5, n_jobs=None, p=None)
# DBSCAN labels; -1 marks noise points not assigned to any cluster
cluster_assignments_db = db.labels_
cluster_assignments_db
## array([ 0, -1, -1,  0, -1,  0,  1,  2,  2,  3,  4, -1,  3,  4, -1,  4,  4,
##         1,  1,  4,  3,  2,  4,  2,  3,  4,  4,  3,  2,  2,  1, -1,  2,  5,
##         6,  7,  7,  7,  8,  8,  8,  8,  7,  8,  5, -1,  7,  5,  5,  7, -1,
##         7,  8, -1,  1,  6,  6, -1,  7,  6,  6,  8,  8,  5, -1,  7,  5, -1,
##         6,  5,  5,  7,  7, -1, -1, -1, -1, -1, -1,  9,  9, 10, -1, 10, 10,
##        -1, 10, 10, 11, 12, 11,  9,  9, 11, 10, 10, -1,  9, 11,  9, 10, 11,
##         9, 11, 10,  9, 10, 10, 13, 11,  9, 10, 14,  9, -1, 10, 15, 11, 10,
##        10, 12, 11, 12,  9, 11,  9, 10, 11, 12, 14, 15, 11, -1, 10, 11, 11,
##        10, 11, 15,  9, 10, 10, 15, 10, 13, 10, 11, 11, 12, 12, 10, 12,  9,
##        11, 15, 14, 12, 11,  9, 12, 11,  9, 10, -1, 15, 11, 12, 11, 11, 12,
##        12, 11,  9, 10, 10, 11, 15, 11,  9, 14, 12, 10, 12, 10, 15, 11, 10,
##        11, 15, 15, 12, 15, 13, 12, 13, 11, 11, 12, 10, 10,  9, 12, 10, 11,
##        12, 13, 12,  9, 11, 15, 12, 12,  9, 10,  9, 11, 10, 11, 12, 12, 10,
##        11, 10, 12, 11, 12, 15, 10, 10, 11, 10,  9, 11, 12, 10, 12,  9, 10,
##        15,  9, 15, 11, 11, 10, 11, 11, 11, 12, 10, 10, 14, 12, 11,  9, 12,
##         9,  9, 10, 15, 11, 10, 12, 10, 12, 14,  9, 11,  9, 12, 11, 14, 10,
##        -1, -1, 11, 12, 11, 14, 10, 10, -1,  2,  2, -1,  8,  6,  6, -1, -1,
##        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  7,  8,  6,  8,  5,
##         7,  7,  6, -1])

TSNE Low-Dimensional Embedding - Python

Needed to create a reduced representation of the original data in 2-D space.

from sklearn.manifold import TSNE
# Project the feature matrix to 2-D for plotting; random_state pins the
# otherwise stochastic embedding so the chart is reproducible
X_embedded = TSNE(n_components=2, random_state=123).fit_transform(r.X)

pd.DataFrame(X_embedded)
##              0          1
## 0    10.122051  11.739521
## 1    -7.448895  -7.267364
## 2    -7.535513  -1.628443
## 3    10.195205  11.912298
## 4    -5.360659 -20.355145
## ..         ...        ...
## 305  13.353748   6.259142
## 306  11.660574   3.584381
## 307  11.434317   3.715972
## 308  14.006830   2.334109
## 309  15.658013   5.544502
## 
## [310 rows x 2 columns]

Py to R

Getting Scikit-Learn Results in RMarkdown

# Affinity Propagation: Python variables are available in R via `py$`
py$cluster_assignments_af
##   [1] 23 17 19 23 13 23  3  0  0  2  1  0  2  1  1  1  1  3  3  1  2  0  1  0  2
##  [26]  1  1  2  0  0  3  1  0  7 20  4  4  4  5  5  5  5  4  5  7  6  4  7  7  4
##  [51]  6  4  5 18  3 20 20  6  4 20 20  5  5  7  6  4  7  5 20  7  7  4  4 23  8
##  [76] 19 19 18  8 11 11  9 19  9  9  8  9  9 18 16 13 11 11 13 10 10 12 11 18 11
## [101] 10 13 11 18 10 11 10 10 17 13 11 10 14 11 12 10 15 18 10 10 16 13 16 11 13
## [126] 11 10 13 16 14 15 18 12 10 18 13 10 13 15 11 10 10 15 10 17 10 13 13 16 16
## [151] 10 16 11 18 15 14 16 13 11 16 13 11 10 12 15 18 16 18 13 16 16 13 11 10 10
## [176] 18 15 13 11 14 16 10 16 10 15 13 10 13 15 15 16 15 17 16 17 13 13 16 10 10
## [201] 11 16 10 13 16 17 16 11 13 15 16 16 11 10 11 13 10 13 16 19 10 13 10 19 18
## [226] 19 15 10  9 18 10 11 18 19 10 19 11 10 15 11 15 18 18 10 18 18 18 16  9 10
## [251] 14 19 18 11 19 11 11 10 15 18  9 16  9 19 14 11 18 11 19 18 14  9 18 19 18
## [276] 16 18 14 10 10 12  0  0  1  5 20 20  2  2 21 21 21 21 21 22 21 22 22 22 23
## [301] 23  4  5 20  5  7  4  4 20  5
# DBSCAN cluster assignments pulled back from the Python session
py$cluster_assignments_db
##   [1]  0 -1 -1  0 -1  0  1  2  2  3  4 -1  3  4 -1  4  4  1  1  4  3  2  4  2  3
##  [26]  4  4  3  2  2  1 -1  2  5  6  7  7  7  8  8  8  8  7  8  5 -1  7  5  5  7
##  [51] -1  7  8 -1  1  6  6 -1  7  6  6  8  8  5 -1  7  5 -1  6  5  5  7  7 -1 -1
##  [76] -1 -1 -1 -1  9  9 10 -1 10 10 -1 10 10 11 12 11  9  9 11 10 10 -1  9 11  9
## [101] 10 11  9 11 10  9 10 10 13 11  9 10 14  9 -1 10 15 11 10 10 12 11 12  9 11
## [126]  9 10 11 12 14 15 11 -1 10 11 11 10 11 15  9 10 10 15 10 13 10 11 11 12 12
## [151] 10 12  9 11 15 14 12 11  9 12 11  9 10 -1 15 11 12 11 11 12 12 11  9 10 10
## [176] 11 15 11  9 14 12 10 12 10 15 11 10 11 15 15 12 15 13 12 13 11 11 12 10 10
## [201]  9 12 10 11 12 13 12  9 11 15 12 12  9 10  9 11 10 11 12 12 10 11 10 12 11
## [226] 12 15 10 10 11 10  9 11 12 10 12  9 10 15  9 15 11 11 10 11 11 11 12 10 10
## [251] 14 12 11  9 12  9  9 10 15 11 10 12 10 12 14  9 11  9 12 11 14 10 -1 -1 11
## [276] 12 11 14 10 10 -1  2  2 -1  8  6  6 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0
## [301]  0  7  8  6  8  5  7  7  6 -1
# Convert the 310 x 2 embedding matrix to a tibble (columns auto-named V1, V2)
X_embedded_tbl <- py$X_embedded %>% as_tibble()
X_embedded_tbl
## # A tibble: 310 x 2
##       V1     V2
##    <dbl>  <dbl>
##  1 10.1   11.7 
##  2 -7.45  -7.27
##  3 -7.54  -1.63
##  4 10.2   11.9 
##  5 -5.36 -20.4 
##  6 10.2   11.9 
##  7  5.33   5.02
##  8  3.53   4.29
##  9  3.53   4.29
## 10  6.15   3.28
## # … with 300 more rows

R Visualizations

library(plotly)
library(tidyquant)

Data Preparation

# Combine the two sets of cluster assignments, the 2-D t-SNE coordinates,
# and the full HR attributes into one table keyed by employee name.
employee_clustering_tbl <- tibble(
    Employee_Name = employee_names,
    cluster_af    = py$cluster_assignments_af,
    cluster_db    = py$cluster_assignments_db
) %>%
    bind_cols(X_embedded_tbl) %>%
    # Explicit join key (the only shared column) — same result as the
    # implicit natural join, without the "Joining, by = ..." message.
    left_join(hr_data_tbl, by = "Employee_Name")

employee_clustering_tbl
## # A tibble: 310 x 39
##    Employee_Name cluster_af cluster_db    V1     V2  EmpID MarriedID
##    <chr>              <dbl>      <dbl> <dbl>  <dbl>  <dbl>     <dbl>
##  1 Brown, Mia            23          0 10.1   11.7  1.10e9         1
##  2 LaRotonda, W…         17         -1 -7.45  -7.27 1.11e9         0
##  3 Steans, Tyro…         19         -1 -7.54  -1.63 1.30e9         0
##  4 Howard, Este…         23          0 10.2   11.9  1.21e9         1
##  5 Singh, Nan            13         -1 -5.36 -20.4  1.31e9         0
##  6 Smith, Leigh…         23          0 10.2   11.9  7.11e8         1
##  7 Bunbury, Jes…          3          1  5.33   5.02 1.50e9         1
##  8 Carter, Mich…          0          2  3.53   4.29 1.40e9         0
##  9 Dietrich, Je…          0          2  3.53   4.29 1.41e9         0
## 10 Digitale, Al…          2          3  6.15   3.28 1.31e9         1
## # … with 300 more rows, and 32 more variables: MaritalStatusID <dbl>,
## #   GenderID <dbl>, EmpStatusID <dbl>, DeptID <dbl>, PerfScoreID <dbl>,
## #   FromDiversityJobFairID <dbl>, PayRate <dbl>, Termd <dbl>, PositionID <dbl>,
## #   Position <chr>, State <chr>, Zip <chr>, DOB <chr>, Sex <chr>,
## #   MaritalDesc <chr>, CitizenDesc <chr>, HispanicLatino <chr>, RaceDesc <chr>,
## #   DateofHire <chr>, DateofTermination <chr>, TermReason <chr>,
## #   EmploymentStatus <chr>, Department <chr>, ManagerName <chr>,
## #   ManagerID <dbl>, RecruitmentSource <chr>, PerformanceScore <chr>,
## #   EngagementSurvey <dbl>, EmpSatisfaction <dbl>, SpecialProjectsCount <dbl>,
## #   LastPerformanceReview_Date <chr>, DaysLateLast30 <dbl>
# Per-DBSCAN-cluster termination rate and cluster size, most
# termination-prone clusters first. Note: term_count is the number of
# employees in the cluster (n()), not the number of terminations.
termination_rate_tbl <- employee_clustering_tbl %>%
    select(cluster_db, Termd) %>%
    group_by(cluster_db) %>%
    summarise(
        term_rate  = mean(Termd),
        term_count = n()
    ) %>%
    ungroup() %>%
    arrange(desc(term_rate))

termination_rate_tbl
## # A tibble: 17 x 3
##    cluster_db term_rate term_count
##         <dbl>     <dbl>      <int>
##  1         13     0.8            5
##  2         15     0.625         16
##  3          9     0.467         30
##  4         10     0.46          50
##  5          0     0.4            5
##  6         11     0.327         49
##  7         12     0.314         35
##  8          7     0.286         14
##  9          8     0.273         11
## 10         -1     0.268         41
## 11          1     0.2            5
## 12          4     0.125          8
## 13         14     0.125          8
## 14          2     0.111          9
## 15          6     0.1           10
## 16          3     0              5
## 17          5     0              9

Termination by Cluster Visualization

# Horizontal bar chart: one bar per cluster, length = cluster size,
# shaded by termination rate; made interactive with plotly.
g <- termination_rate_tbl %>%
    mutate(cluster_db = fct_reorder(as_factor(cluster_db), term_count)) %>%
    ggplot(aes(x = term_count, y = cluster_db, fill = term_rate)) +
    geom_col() +
    theme_tq() +
    labs(
        title = "Termination Rate by Employee Cluster",
        fill  = "Term. Rate",
        x     = "Termination Count",
        y     = "Cluster Assignment"
    )

ggplotly(g)

Cluster Network Visualization - R

# Interactive scatter of the t-SNE embedding, colored by DBSCAN cluster.
# `description` builds the multi-line hover tooltip used by plotly.
g <- employee_clustering_tbl %>%
    # Explicit join key (the only shared column) — identical result to the
    # implicit natural join, without the console message.
    left_join(termination_rate_tbl, by = "cluster_db") %>%
    mutate(description = str_glue("{Employee_Name}
                                  Position = {Position}
                                  MaritalDesc = {MaritalDesc}
                                  Sex = {Sex}
                                  Race = {RaceDesc}
                                  EmpStatusID = {EmpStatusID}
                                  PayRate = {PayRate}
                                  Terminated = {Termd}
                                  Term Reason = {TermReason}
                                  
                                  Cluster Term Rate: {scales::percent(term_rate)}
                                  Cluster Term Count: {term_count}
                                  
                                  ")
    ) %>%
    select(Employee_Name:V2, description, Termd, 
           term_rate, term_count) %>%
    
    ggplot(aes(V1, V2, color = factor(cluster_db))) +
    geom_point(aes(text = description, size = term_rate), alpha = 0.5) +
    scale_color_tq() +
    theme_tq() +
    # theme(legend.position = "none") + 
    labs(title = "Employee Cluster Assignments", color = "Cluster")
    

ggplotly(g)

Shiny

Sourcing Python

# Bonus #1!!!
# Source standalone Python scripts; their top-level functions become
# directly callable from R via reticulate.
source_python("py/clustering.py")
source_python("py/tsne.py")
# Calls def cluster_dbscan(...) 
cluster_dbscan(X)
##   [1]  0 -1 -1  0 -1  0  1  2  2  3  4 -1  3  4 -1  4  4  1  1  4  3  2  4  2  3
##  [26]  4  4  3  2  2  1 -1  2  5  6  7  7  7  8  8  8  8  7  8  5 -1  7  5  5  7
##  [51] -1  7  8 -1  1  6  6 -1  7  6  6  8  8  5 -1  7  5 -1  6  5  5  7  7 -1 -1
##  [76] -1 -1 -1 -1  9  9 10 -1 10 10 -1 10 10 11 12 11  9  9 11 10 10 -1  9 11  9
## [101] 10 11  9 11 10  9 10 10 13 11  9 10 14  9 -1 10 15 11 10 10 12 11 12  9 11
## [126]  9 10 11 12 14 15 11 -1 10 11 11 10 11 15  9 10 10 15 10 13 10 11 11 12 12
## [151] 10 12  9 11 15 14 12 11  9 12 11  9 10 -1 15 11 12 11 11 12 12 11  9 10 10
## [176] 11 15 11  9 14 12 10 12 10 15 11 10 11 15 15 12 15 13 12 13 11 11 12 10 10
## [201]  9 12 10 11 12 13 12  9 11 15 12 12  9 10  9 11 10 11 12 12 10 11 10 12 11
## [226] 12 15 10 10 11 10  9 11 12 10 12  9 10 15  9 15 11 11 10 11 11 11 12 10 10
## [251] 14 12 11  9 12  9  9 10 15 11 10 12 10 12 14  9 11  9 12 11 14 10 -1 -1 11
## [276] 12 11 14 10 10 -1  2  2 -1  8  6  6 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0
## [301]  0  7  8  6  8  5  7  7  6 -1
# Calls def tsne_embedding(...); the 2-D embedding comes back as an R matrix
tsne_embedding(X) %>% head()
##           [,1]       [,2]
## [1,] 10.122051  11.739521
## [2,] -7.448895  -7.267364
## [3,] -7.535513  -1.628443
## [4,] 10.195205  11.912298
## [5,] -5.360659 -20.355145
## [6,] 10.207287  11.939018

Integrate into Shiny App!

# Bonus #2!!!